
/*
Clean Safegraph Social Distancing data
*/

* ++++++++++++++++++++++
*** read in all the raw Social Distancing files
* extract key information and put in consistent format
* ++++++++++++++++++++++

* small program for creating average distance traveled variable
cap program drop calc_avg_dist_traveled
program define calc_avg_dist_traveled
	replace bucketed_distance_traveled = substr(bucketed_distance_traveled, 2, .)
	replace bucketed_distance_traveled = regexr(bucketed_distance_traveled, "}", "")
	split bucketed_distance_traveled, parse(",")
	forval a=1/7 {
		split bucketed_distance_traveled`a', parse(":")
		replace bucketed_distance_traveled`a'1 = substr(bucketed_distance_traveled`a'1, 2, .)
		replace bucketed_distance_traveled`a'1 = substr(bucketed_distance_traveled`a'1, 1, ///
			strlen(bucketed_distance_traveled`a'1)-1)
		replace bucketed_distance_traveled`a'2 = "0" if ///
			bucketed_distance_traveled`a'2 == ""
		split bucketed_distance_traveled`a'1, parse("-")
		replace bucketed_distance_traveled`a'11 = ///
			regexr(bucketed_distance_traveled`a'11, ">", "")
		replace bucketed_distance_traveled`a'12 = "0" if ///
			bucketed_distance_traveled`a'11 =="0" | bucketed_distance_traveled`a'11 ==""
		replace bucketed_distance_traveled`a'12 = "100000" ///
			if bucketed_distance_traveled`a'11 =="50000"
		destring bucketed_distance_traveled`a'2 bucketed_distance_traveled`a'12 ///
			bucketed_distance_traveled`a'11, replace force
		gen avg_bucket_`a' = (bucketed_distance_traveled`a'12 + ///
			bucketed_distance_traveled`a'11)/2
	}
	gen sum_devices = bucketed_distance_traveled12
	gen avg_distance_traveled = bucketed_distance_traveled12 * avg_bucket_1
	forval a=2/7 {
		replace sum_devices = sum_devices + bucketed_distance_traveled`a'2
		replace avg_distance_traveled = avg_distance_traveled + ///
			bucketed_distance_traveled`a'2 * avg_bucket_`a' if avg_bucket_`a' !=.
	}
	replace avg_distance_traveled = avg_distance_traveled / sum_devices
end

local j =1 
forval m = 1/7 {
	forval d = 1/31 { // not all files up to day 31 exist for all months (e.g. april only has 30 days)
	// --> confirm they do exist
		if `d' < 10 {
			local dpad = "0`d'"
		} 
		else {
			local dpad = "`d'"
		}
		cap confirm file "${data_raw_sd}/2020-0`m'-`dpad'-social-distancing.csv"
		di "${data_raw_sd}/2020-0`m'-`dpad'-social-distancing.csv"
		if !_rc {
			import delimited "${data_raw_sd}/2020-0`m'-`dpad'-social-distancing.csv", clear // read in files
			* create average distance traveled variable 
			calc_avg_dist_traveled
			keep origin_census_block_group completely_home_device_count ///
			device_count avg_distance_traveled sum_devices // keep key vars
			gen date = "2020-0`m'-`dpad'" // create date var
			tempfile part_`j'
			save `part_`j''
			local ++j 
		}
	}
}


* now combine all the individual files
local --j
use `part_1', clear
forval i = 2/`j' {
	append using `part_`i''
}

* save combination of all raw files
isid origin_census_block_group date
save "${data_derived_sd}/combine_raw_files.dta", replace

* ++++++++++++++++++++++
*** create zip level social distancing data
* at weekly level
* ++++++++++++++++++++++

* zip to tract crosswalk --> we need this to collapse from census block group level to zips
import excel using "${data_raw}/HUD-zip-tract/ZIP_TRACT_122019.xlsx", clear firstrow

* clean file a bit 
rename (ZIP TRACT RES_RATIO ) (zip tract res_ratio)
keep zip tract res_ratio
tempfile cw_tract_zip
save `cw_tract_zip'

* combination of all social distancing raw files as generated above
use "${data_derived_sd}/combine_raw_files.dta", clear

* create tract variable
gen block_str = string(origin_census_block_group, "%012.0f")
gen tract = substr(block_str, 1, 11)

* collapse to tract by date level 
collapse (rawsum) device_count completely_home_device_count sum_devices ///
	(mean) avg_distance_traveled [w=sum_devices], by(date tract)

* merge on zips --> we have many obs for each zip in master data (because of date dimension)
* and also have multiple obs per zip in using data because some zips fall into
* multiple tracts --> hence use joinby
joinby tract using `cw_tract_zip'

* assign devices (all and at home) to zips in proportion to which tracts fall into zips
replace device_count = device_count * res_ratio
replace sum_devices = sum_devices * res_ratio
replace completely_home_device_count = completely_home_device_count * res_ratio

* collapse to zip by date level 
collapse (rawsum) device_count completely_home_device_count sum_devices ///
	(mean) avg_distance_traveled [w=sum_devices], by(date zip)

* collapse to week level
gen daily_date = date(date , "YMD")
format daily_date %td
gen week = week(daily_date)
ren zip zcta
collapse (mean) device_count completely_home_device_count avg_distance_traveled ///
	[w=device_count], by(zcta week)

* save
destring zcta, replace
save "${data_derived_sd}/zip_soc_dist_weekly.dta", replace
	

